import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cluster import KMeans
from bioinfokit.visuz import cluster
from sklearn.mixture import GaussianMixture
import statsmodels.api as sm
# Load the McDonald's customer survey data set.
csv_path = "C:/Users/Nirmal/Documents/Python Scripts/mcdonalds.csv"
data = pd.read_csv(csv_path)
data
| yummy | convenient | spicy | fattening | greasy | fast | cheap | tasty | expensive | healthy | disgusting | Like | Age | VisitFrequency | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | No | Yes | No | Yes | No | Yes | Yes | No | Yes | No | No | -3 | 61 | Every three months | Female |
| 1 | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes | No | No | 2 | 51 | Every three months | Female |
| 2 | No | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | No | 1 | 62 | Every three months | Female |
| 3 | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | No | No | Yes | 4 | 69 | Once a week | Female |
| 4 | No | Yes | No | Yes | Yes | Yes | Yes | No | No | Yes | No | 2 | 49 | Once a month | Male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1448 | No | Yes | No | Yes | Yes | No | No | No | Yes | No | Yes | -5 | 47 | Once a year | Male |
| 1449 | Yes | Yes | No | Yes | No | No | Yes | Yes | No | Yes | No | 2 | 36 | Once a week | Female |
| 1450 | Yes | Yes | No | Yes | No | Yes | No | Yes | Yes | No | No | 3 | 52 | Once a month | Female |
| 1451 | Yes | Yes | No | No | No | Yes | Yes | Yes | No | Yes | No | 4 | 41 | Every three months | Male |
| 1452 | No | Yes | No | Yes | Yes | No | No | No | Yes | No | Yes | -3 | 30 | Every three months | Male |
1453 rows × 15 columns
# Keep an independent copy of the raw survey frame for the regression
# section later. `data2 = data` only aliases the same DataFrame object, so
# the in-place label encoding applied to data2 further down would silently
# mutate `data` as well if `data` were not rebound first; .copy() makes the
# two frames truly independent.
data2 = data.copy()
data["Age"].value_counts()
Age 55 53 60 38 37 37 59 36 57 36 52 36 58 35 36 35 49 34 62 34 50 34 32 33 44 32 56 32 64 32 53 31 26 31 24 30 35 30 51 30 47 30 42 30 23 30 39 29 29 28 34 28 30 28 38 27 40 27 31 27 25 26 33 26 61 26 67 26 48 26 43 25 27 25 63 25 54 24 41 23 22 23 65 23 45 22 20 21 46 19 28 18 66 17 21 16 18 16 70 15 69 14 68 13 19 10 71 1 Name: count, dtype: int64
# Frequency distribution of how often respondents visit.
data.VisitFrequency.value_counts()
VisitFrequency Once a month 439 Every three months 342 Once a year 252 Once a week 235 Never 131 More than once a week 54 Name: count, dtype: int64
# Gender breakdown of the respondents.
data.Gender.value_counts()
Gender Female 788 Male 665 Name: count, dtype: int64
# Gender split as an interactive pie chart.
labels = ['Male', 'Female']
# Count each gender in a single value_counts() pass instead of scanning the
# frame twice with query(); the resulting sizes are identical.
gender_counts = data['Gender'].value_counts()
sizes = [gender_counts.get('Male', 0), gender_counts.get('Female', 0)]
colors = ['darkblue', 'cyan']
fig = go.Figure(data=[go.Pie(labels=labels, values=sizes)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=colors))
fig.show()
# Strip plot of customer age against visit frequency, coloured by gender.
fig2 = sns.catplot(
    data=data,
    x="VisitFrequency",
    y="Age",
    hue="Gender",
    kind="strip",
    palette="CMRmap",
    height=8,
    aspect=2,
)
plt.xticks(rotation='vertical')
plt.title('Visit Frequency Based on Age', fontsize=20)
plt.xlabel('Frequency of Visit')
plt.show()
C:\Users\Nirmal\AppData\Roaming\Python\Python311\site-packages\seaborn\axisgrid.py:123: UserWarning: The figure layout has changed to tight
# Distribution of the ordinal "Like" ratings (-5 … +5).
# Assign the x variable to hue and disable the legend: passing `palette`
# without `hue` is deprecated in seaborn and scheduled for removal in
# v0.14 (this was the FutureWarning the original cell emitted).
fig3 = sns.countplot(x='Like', hue='Like', data=data, palette="Reds", legend=False)
plt.title('Distribution of Customers by Preference Levels', fontsize=12)
plt.xlabel('Likeness')
plt.show()
C:\Users\Nirmal\AppData\Local\Temp\ipykernel_25864\202406410.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.
# Restrict the frame to the eleven Yes/No perception attributes only.
data = data.drop(columns=["Like", "Age", "VisitFrequency", "Gender"])
data
| yummy | convenient | spicy | fattening | greasy | fast | cheap | tasty | expensive | healthy | disgusting | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | No | Yes | No | Yes | No | Yes | Yes | No | Yes | No | No |
| 1 | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes | No | No |
| 2 | No | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | No |
| 3 | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | No | No | Yes |
| 4 | No | Yes | No | Yes | Yes | Yes | Yes | No | No | Yes | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1448 | No | Yes | No | Yes | Yes | No | No | No | Yes | No | Yes |
| 1449 | Yes | Yes | No | Yes | No | No | Yes | Yes | No | Yes | No |
| 1450 | Yes | Yes | No | Yes | No | Yes | No | Yes | Yes | No | No |
| 1451 | Yes | Yes | No | No | No | Yes | Yes | Yes | No | Yes | No |
| 1452 | No | Yes | No | Yes | Yes | No | No | No | Yes | No | Yes |
1453 rows × 11 columns
# Encode every Yes/No column as 0/1 (the encoder is re-fit per column,
# so each column is mapped independently: No -> 0, Yes -> 1).
encoder = LabelEncoder()
data = data.apply(encoder.fit_transform)
data.head(6)
| yummy | convenient | spicy | fattening | greasy | fast | cheap | tasty | expensive | healthy | disgusting | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 2 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 |
| 3 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 |
| 4 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 |
| 5 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
# Share of "Yes" answers per attribute (column means of the 0/1 data).
result = round(data.mean(), 2)
print(result)
yummy 0.55 convenient 0.91 spicy 0.09 fattening 0.87 greasy 0.53 fast 0.90 cheap 0.60 tasty 0.64 expensive 0.36 healthy 0.20 disgusting 0.24 dtype: float64
# Principal component analysis on the binary perception data, reported in
# the same layout as R's summary(prcomp(...)).
pca = PCA()
MD_pca = pca.fit_transform(data)
n_components = min(data.shape)
explained = pca.explained_variance_ratio_
results = pd.DataFrame(
    {
        'Standard deviation': np.sqrt(pca.explained_variance_),
        'Proportion of Variance': explained,
        'Cumulative Proportion': explained.cumsum(),
    },
    index=[f'PC{i + 1}' for i in range(n_components)],
)
print(results.round(4))
Standard deviation Proportion of Variance Cumulative Proportion PC1 0.7570 0.2994 0.2994 PC2 0.6075 0.1928 0.4922 PC3 0.5046 0.1330 0.6253 PC4 0.3988 0.0831 0.7084 PC5 0.3374 0.0595 0.7679 PC6 0.3103 0.0503 0.8182 PC7 0.2897 0.0438 0.8620 PC8 0.2751 0.0395 0.9016 PC9 0.2653 0.0368 0.9383 PC10 0.2488 0.0324 0.9707 PC11 0.2369 0.0293 1.0000
# Feature loadings: the contribution of each survey attribute to each PC.
# components_ has shape (n_components, n_features); transpose so rows are
# features and columns are PCs.
loadings = pca.components_
num_pc = pca.n_features_in_
pc_list = [f"PC{i}" for i in range(1, num_pc + 1)]
loadings_data = pd.DataFrame(loadings.T, columns=pc_list)
loadings_data['feature'] = data.columns.values
loadings_data = loadings_data.set_index('feature')
print(loadings_data.round(2))
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 feature yummy -0.48 0.36 -0.30 0.06 -0.31 0.17 -0.28 0.01 0.57 -0.11 0.05 convenient -0.16 0.02 -0.06 -0.14 0.28 -0.35 -0.06 -0.11 -0.02 -0.67 -0.54 spicy -0.01 0.02 -0.04 0.20 0.07 -0.36 0.71 0.38 0.40 -0.08 0.14 fattening 0.12 -0.03 -0.32 -0.35 -0.07 -0.41 -0.39 0.59 -0.16 -0.01 0.25 greasy 0.30 -0.06 -0.80 0.25 0.36 0.21 0.04 -0.14 -0.00 0.01 0.00 fast -0.11 -0.09 -0.06 -0.10 0.11 -0.59 -0.09 -0.63 0.17 0.24 0.34 cheap -0.34 -0.61 -0.15 0.12 -0.13 -0.10 -0.04 0.14 0.08 0.43 -0.49 tasty -0.47 0.31 -0.29 -0.00 -0.21 -0.08 0.36 -0.07 -0.64 0.08 0.02 expensive 0.33 0.60 0.02 0.07 -0.00 -0.26 -0.07 0.03 0.07 0.45 -0.49 healthy -0.21 0.08 0.19 0.76 0.29 -0.18 -0.35 0.18 -0.19 -0.04 0.16 disgusting 0.37 -0.14 -0.09 0.37 -0.73 -0.21 -0.03 -0.17 -0.07 -0.29 -0.04
# PCA biplot (scores + loadings). Reuse the scores already computed by the
# fitted PCA above (MD_pca) instead of fitting a second, redundant PCA on
# the same data.
cluster.biplot(cscore=MD_pca, loadings=loadings, labels=data.columns.values,
               var1=round(pca.explained_variance_ratio_[0] * 100, 2),
               var2=round(pca.explained_variance_ratio_[1] * 100, 2),
               show=True, dim=(10, 5))
# Elbow plot: within-cluster sum of squares for k = 1..10.
wcss = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(data).inertia_
        for k in range(1, 11)]
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Segment customers with k = 4 (chosen from the elbow plot) on the PCA scores.
# n_init is pinned to 10 explicitly: that matches today's default and
# silences sklearn's FutureWarning about the default changing to 'auto'
# (the warning the original cell emitted).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
data['Cluster'] = kmeans.fit_predict(MD_pca)
data['PCA1'] = MD_pca[:, 0]
data['PCA2'] = MD_pca[:, 1]
# Per-cluster centroids in the first two PC dimensions; select the columns
# before aggregating instead of averaging every column and discarding most.
centroids = data.groupby('Cluster')[['PCA1', 'PCA2']].mean().values
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=data, palette='viridis', s=50, edgecolor='k')
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='red', label='Centroid', marker='X')
plt.legend()
plt.title('Customer Clusters')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.show()
C:\Users\Nirmal\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Distribution of the raw Like ratings in the untouched copy.
data2.Like.value_counts()
Like 3 229 2 187 0 169 4 160 1 152 -5 152 5 143 -3 73 -4 71 -2 59 -1 58 Name: count, dtype: int64
# Binary-encode the eleven Yes/No perception columns of data2 in place.
# Keep just the column names: the original selected a full sub-DataFrame
# (materialising a copy of all eleven columns) only to iterate over its
# column labels.
perception_cols = ['yummy', 'convenient', 'spicy', 'fattening', 'greasy', 'fast',
                   'cheap', 'tasty', 'expensive', 'healthy', 'disgusting']
label_encoder = LabelEncoder()
for col in perception_cols:
    # fit_transform re-fits per column: No -> 0, Yes -> 1 for each.
    data2[col] = label_encoder.fit_transform(data2[col])
print(data2.head(6))
yummy convenient spicy fattening greasy fast cheap tasty expensive \ 0 0 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 2 0 1 1 1 1 1 0 1 1 3 1 1 0 1 1 1 1 1 0 4 0 1 0 1 1 1 1 0 0 5 1 1 0 1 0 1 1 1 0 healthy disgusting Like Age VisitFrequency Gender 0 0 0 -3 61 Every three months Female 1 0 0 2 51 Every three months Female 2 1 0 1 62 Every three months Female 3 0 1 4 69 Once a week Female 4 1 0 2 49 Once a month Male 5 0 0 2 55 Every three months Male
# Drop the demographic columns; keep the perceptions plus the Like target.
data2 = data2.drop(columns=["Age", "VisitFrequency", "Gender"])
data2
| yummy | convenient | spicy | fattening | greasy | fast | cheap | tasty | expensive | healthy | disgusting | Like | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | -3 |
| 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 2 |
| 2 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 |
| 3 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 4 |
| 4 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1448 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | -5 |
| 1449 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 2 |
| 1450 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 3 |
| 1451 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 4 |
| 1452 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | -3 |
1453 rows × 12 columns
# Build the R-style regression formula: Like ~ the eleven perception attributes.
perception_vars = data2.columns[:11]
formula = 'Like ~ {}'.format(' + '.join(perception_vars))
print(formula)
X = data2[perception_vars]
y = data2['Like']
# Two-component Gaussian mixture fitted on predictors plus the response,
# approximating a finite mixture of regressions (flexmix-style) segmentation.
reg_mix = GaussianMixture(n_components=2, n_init=10, random_state=1234)
features_with_target = np.column_stack((X, y.values.reshape(-1, 1)))
cluster_labels = reg_mix.fit_predict(features_with_target)
print("Cluster sizes:")
print(np.bincount(cluster_labels))
print(f"Convergence after {reg_mix.n_iter_} iterations")
Cluster sizes: [ 393 1060] Convergence after 8 iterations
# Fit a separate OLS regression of Like on the perceptions for each segment.
# The loop variable is named cluster_id, not `cluster`: the original name
# shadowed the `bioinfokit.visuz.cluster` module imported at the top of the
# file, breaking any later call to cluster.biplot et al.
for cluster_id in [0, 1]:
    print(f"\nCluster {cluster_id + 1}")
    mask = cluster_labels == cluster_id
    X_cluster = sm.add_constant(X[mask])
    y_cluster = y[mask]
    model = sm.OLS(y_cluster, X_cluster).fit()
    print(model.summary())
Cluster 1
OLS Regression Results
==============================================================================
Dep. Variable: Like R-squared: 0.503
Model: OLS Adj. R-squared: 0.489
Method: Least Squares F-statistic: 35.06
Date: Mon, 26 Aug 2024 Prob (F-statistic): 2.45e-51
Time: 20:11:15 Log-Likelihood: -853.81
No. Observations: 393 AIC: 1732.
Df Residuals: 381 BIC: 1779.
Df Model: 11
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -2.6131 0.516 -5.064 0.000 -3.628 -1.599
yummy 3.2997 0.375 8.807 0.000 2.563 4.036
convenient 0.9480 0.285 3.325 0.001 0.387 1.509
spicy -0.0740 0.384 -0.193 0.847 -0.829 0.680
fattening 0.0518 0.475 0.109 0.913 -0.882 0.986
greasy 0.0906 0.289 0.313 0.754 -0.479 0.660
fast 0.4125 0.315 1.309 0.191 -0.207 1.032
cheap -0.0054 0.315 -0.017 0.986 -0.625 0.614
tasty 1.3688 0.335 4.089 0.000 0.711 2.027
expensive -0.0361 0.309 -0.117 0.907 -0.644 0.571
healthy 0.3812 0.447 0.852 0.395 -0.498 1.261
disgusting -1.9545 0.466 -4.195 0.000 -2.871 -1.038
==============================================================================
Omnibus: 3.824 Durbin-Watson: 1.982
Prob(Omnibus): 0.148 Jarque-Bera (JB): 3.569
Skew: 0.220 Prob(JB): 0.168
Kurtosis: 3.157 Cond. No. 13.1
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Cluster 2
OLS Regression Results
==============================================================================
Dep. Variable: Like R-squared: 0.414
Model: OLS Adj. R-squared: 0.409
Method: Least Squares F-statistic: 82.31
Date: Mon, 26 Aug 2024 Prob (F-statistic): 2.49e-115
Time: 20:11:15 Log-Likelihood: -2127.0
No. Observations: 1060 AIC: 4274.
Df Residuals: 1050 BIC: 4324.
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
yummy 2.0706 0.150 13.770 0.000 1.776 2.366
convenient -0.3373 0.302 -1.116 0.265 -0.930 0.256
spicy -0.4944 0.197 -2.505 0.012 -0.882 -0.107
fattening -0.5010 0.172 -2.912 0.004 -0.839 -0.163
greasy -0.3650 0.118 -3.089 0.002 -0.597 -0.133
fast 0.2400 0.222 1.082 0.279 -0.195 0.675
cheap 0.0656 0.174 0.378 0.705 -0.275 0.406
tasty 1.3624 0.166 8.202 0.000 1.036 1.688
expensive 0.0112 0.181 0.062 0.951 -0.344 0.366
healthy 0.4613 0.143 3.230 0.001 0.181 0.742
disgusting 0 0 nan nan 0 0
==============================================================================
Omnibus: 70.223 Durbin-Watson: 2.009
Prob(Omnibus): 0.000 Jarque-Bera (JB): 84.166
Skew: -0.634 Prob(JB): 5.29e-19
Kurtosis: 3.548 Cond. No. inf
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 0. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
C:\ProgramData\anaconda3\Lib\site-packages\statsmodels\regression\linear_model.py:1965: RuntimeWarning: divide by zero encountered in scalar divide
# Coefficient estimates (with standard errors) transcribed by hand from the
# two OLS summaries above, visualised side by side per segment.
comp1_coef = [3.2997, 0.9480, -0.0740, 0.0518, 0.0906, 0.4125, -0.0054, 1.3688, -0.0361, 0.3812, -1.9545]
comp1_se = [0.375, 0.285, 0.384, 0.475, 0.289, 0.315, 0.315, 0.335, 0.309, 0.447, 0.466]
comp2_coef = [2.0706, -0.3373, -0.4944, -0.5010, -0.3650, 0.2400, 0.0656, 1.3624, 0.0112, 0.4613, 0]
comp2_se = [0.150, 0.302, 0.197, 0.172, 0.118, 0.222, 0.174, 0.166, 0.181, 0.143, 0]
variables = ['yummy', 'convenient', 'spicy', 'fattening', 'greasy',
             'fast', 'cheap', 'tasty', 'expensive', 'healthy', 'disgusting']
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8), sharey=True)
fig.suptitle('Coefficient Estimates for Segments 1 and 2')
def plot_coef(ax, coef, se, title):
    # Horizontal bars with the standard error drawn as an error bar.
    positions = np.arange(len(variables))
    ax.barh(positions, coef, xerr=se, align='center', capsize=5,
            color='skyblue', edgecolor='black')
    ax.axvline(x=0, color='k', linestyle='--')
    ax.set_yticks(positions)
    ax.set_yticklabels(variables)
    ax.set_title(title)
    ax.set_xlim(-6, 6)
for axis, coefs, errs, label in [(ax1, comp1_coef, comp1_se, 'Segment 1'),
                                 (ax2, comp2_coef, comp2_se, 'Segment 2')]:
    plot_coef(axis, coefs, errs, label)
ax1.invert_yaxis()
plt.tight_layout()
plt.show()